Takeda NK data

Takeda data with the additional NK pool in the sample.
However, the added NK cells don’t have any tags.
I tried to classify the additional NK cells and endogenous NK cells using the ML algorithms as well as to find the features that separate them.

Note about data scaling/transformation methods

Term Description R Code Example
center Centers the data to the mean, shifting the data such that the center of each column is located at the origin by subtracting the mean value of each column. data <- scale(data, center = TRUE, scale = FALSE)
scale Standardizes the data by dividing each column by its standard deviation, adjusting the scale of the data. data <- scale(data, center = FALSE, scale = TRUE)
YeoJohnson Applies the Yeo-Johnson transformation, which is designed to maintain a normal distribution of the data. library(car); data <- YeoJohnson(data)
nzv (Near Zero Variance) Removes columns (features) with almost no variance, identifying and excluding features where the variance is close to zero. library(caret); nzv <- nearZeroVar(data); data <- data[, -nzv]

Variable features

VariableFeatures(obj.srt) %>% head()
## [1] "CCL4L2"   "HIST1H4C" "HBB"      "CXCL8"    "SPP1"     "HBA2"
cat("The number of variable features is ", length(VariableFeatures(obj.srt)), "\n")
## The number of variable features is  3000
# Replace "-" with "_"
VariableFeatures(obj.srt) <- gsub("-", "_", VariableFeatures(obj.srt))

Treatment selection

selected_treatments = c("CTL","NKC")
cells = obj.srt@meta.data[obj.srt@meta.data$sample %in% selected_treatments,] %>% rownames()
df= obj.srt@meta.data %>% filter(sample %in% selected_treatments) %>% select(c("orig.ident","sample"))
df$sample =factor(df$sample, levels = c("CTL","NKC"))
df %>% select(sample) %>% table()
## sample
## CTL NKC 
## 325 922
# df

Expression Matrix

Features : Variable features (3000)
Gene expression : normalized data not scaled (scale will be done later)
Cells : Cells from CTL and NKC samples

exp.mtx= data.frame(obj.srt@assays$RNA@data, check.names = F)[VariableFeatures(obj.srt),cells] %>% t() %>% data.frame(check.names = F)
exp.mtx %>% dim()
## [1] 1247 3000

Sample Classification

set.seed(1234)
train_prop = 0.75
cell_split= initial_split(df, prop = train_prop)
cell_training=training(cell_split)
cell_test=testing(cell_split)
cell_split
cell_training
cell_test
cell_training$sample %>% table()
cell_test$sample %>% table()
cell_training
exp.mtx
train_set <- merge(cell_training, exp.mtx, by = "row.names")
train_set[1:3,1:8]
##                    Row.names orig.ident sample   CCL4L2 HIST1H4C HBB    CXCL8
## 1 TN30293_AAACGCTGTTCAAAGA-2      30293    NKC 0.000000 0.000000   0 2.366233
## 2 TN30293_AAAGAACCAATCGCAT-2      30293    NKC 3.845169 0.000000   0 0.000000
## 3 TN30293_AAAGGTAAGATGAATC-2      30293    NKC 3.556894 4.585522   0 0.000000
##   SPP1
## 1    0
## 2    0
## 3    0
test_set <- merge(cell_test, exp.mtx, by = "row.names")
test_set[1:3,1:8]
##                    Row.names orig.ident sample   CCL4L2 HIST1H4C HBB   CXCL8
## 1 TN30293_AAACCCAAGATCCTAC-2      30293    NKC 0.000000 0.000000   0 0.00000
## 2 TN30293_AAAGAACAGCACTCGC-2      30293    NKC 0.000000 2.082862   0 0.00000
## 3 TN30293_AACAAAGCAGGAGGTT-2      30293    NKC 3.627605 0.000000   0 2.21248
##       SPP1
## 1 0.000000
## 2 0.000000
## 3 1.623178

Data preprocessing

Merge train and test set

# Merge train and test set  
tmp = rbind(train_set,test_set)
tmp %>% dim()
## [1] 1247 3003

Preprocessing data using predict() function

# Use numeric values only  
all_data = tmp[,4:ncol(tmp)]
# predict    
all_data = predict(preProcess(all_data, method = c("center", "scale", "YeoJohnson", "nzv")),all_data)

all_data[1:3,1:8]
##       CCL4L2   HIST1H4C      CXCL8      CTSL       CCL4      CENPF       SOD2
## 1 -0.5719783 -0.7074698  2.0459790 -0.316838 -1.0812885 -0.4392773  1.4017583
## 2  1.8158364 -0.7074698 -0.5411131 -0.316838  1.5401948 -0.4392773 -0.7865508
## 3  1.8112970  1.6333059 -0.5411131  2.100573  0.9865894  2.2856705 -0.7865508
##      TNFRSF9
## 1 -0.3341243
## 2 -0.3341243
## 3 -0.3341243
rownames(all_data) = tmp$Row.names
all_data$sample = tmp$sample
all_data %>% dim()
## [1] 1247  698
all_data$sample =factor(all_data$sample, levels = c("CTL","NKC"))
#all_data$sample

Split train_set and test_set

train_set_processed = all_data[train_set$Row.names,]
test_set_processed = all_data[test_set$Row.names,]

Explore dataset

tSNE

set.seed(94512)   
tsne <- Rtsne::Rtsne(all_data[,colnames(all_data)!='sample'], dims = 2, perplexity=15, verbose=TRUE, max_iter = 500)
# tsne %>% saveRDS(paste0(dir,"rds/ML.23.10.13.tsne.rds"))
# tsne$Y has tSNE X and Y information  
tsne.df = as.data.frame(tsne$Y)
rownames(tsne.df) = rownames(all_data)
colnames(tsne.df) = c('tsne.1', 'tsne.2')

tsne.df$sample = all_data$sample

ggplot(tsne.df, aes(x=tsne.1, y=tsne.2, color=sample))+
  geom_point()+
  theme_bw()

Kmeans clustering

data_for_clustering <- all_data[, colnames(all_data) != 'sample']

k <- 6  
kmeans_result <- kmeans(data_for_clustering, centers = k, nstart = 5)

# check the result
#kmeans_result$cluster  

# check the cluster center
#kmeans_result$centers  
kmeans_result %>% saveRDS(paste0(dir,"rds/ML.23.10.13.kmeans_6.rds"))
clustered_tsne_data <- data.frame(Cluster = as.factor(kmeans_result$cluster), tsne.df)
# tSNE Plot 
ggplot(clustered_tsne_data, aes(tsne.1,tsne.2, color = Cluster)) +
  geom_point() +
  labs(title = "t-SNE Clustering") +
  theme_minimal()

ggplot(clustered_tsne_data, aes(tsne.1,tsne.2, color = Cluster)) +
  geom_point() +
  labs(title = "t-SNE Clustering") +
  theme_minimal() +facet_wrap(.~Cluster, ncol = 3)

CTL-NKC Models classification

Random Forest: Train model
CTL-NKC classification

df2=train_set_processed
colnames(df2) = gsub("-", "_", colnames(df2))
sel=colnames(df2[,1:(ncol(df2)-1)]) 
# sel = sel[!(grepl("HLA", sel))]
# sel = sel[!(grepl("MT", sel))]
# sel = sel[!(grepl("EPB41L4A", sel))]

# Assuming 'sample' is the response variable and 'sel' is a character vector of predictor variables
response_var <- "sample"
predictor_vars <- sel

# Create the formula using reformulate
test_vars <- predictor_vars[1:620]  # adjust based on your actual data
formula <- as.formula(paste(response_var, "~", 
                                 paste(test_vars, collapse = "+")))

# formula <- as.formula(paste(response_var, "~", 
#                             paste(predictor_vars, collapse = "+")))
to_forest = df2

# train_control 
train_control <- trainControl(method="cv", number=10, classProbs = TRUE, summaryFunction = twoClassSummary)
# Check the data frame structure and variable names
set.seed(123)
Sys.time()
rf2 <- train(formula, data = to_forest, method = "rf", tuneLength=10, importance=T, trControl=train_control, metric = "ROC")
# Sys.time()
#rf2 %>% saveRDS(paste0(dir, "rds/ML.23.10.13.rf2.rds"))
# Define the trainControl settings with the positive class as "NKC"
# train_control 
train_control <- trainControl(method="cv", number=10, classProbs = TRUE, 
                              summaryFunction = twoClassSummary)

# Train the random forest model with the specified trainControl
rf3 <- train(formula, data = to_forest, method = "rf", tuneLength=10, importance=T, trControl=train_control, metric = "ROC", positive = "NKC")

rf3 %>% saveRDS(paste0(dir, "rds/ML.23.10.13.rf3.NKC.positive.rds"))

Read Random Foreset rds

rf2 = readRDS(paste0(dir, "rds/ML.23.10.13.rf2.rds"))
rf2 
## Random Forest 
## 
## 935 samples
## 620 predictors
##   2 classes: 'CTL', 'NKC' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 841, 842, 842, 842, 841, 841, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens         Spec     
##     2   0.8112668  0.008333333  0.9985714
##     3   0.8128744  0.012500000  0.9971429
##     7   0.8171752  0.042572464  0.9942857
##    13   0.8268715  0.046557971  0.9914286
##    25   0.8232656  0.054528986  0.9871222
##    48   0.8260115  0.092934783  0.9814079
##    91   0.8278610  0.101449275  0.9714079
##   173   0.8231108  0.114130435  0.9699793
##   327   0.8149763  0.140217391  0.9642650
##   620   0.8137224  0.148188406  0.9585507
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 91.
rf3 = readRDS(paste0(dir, "rds/ML.23.10.13.rf3.NKC.positive.rds"))
rf3 
## Random Forest 
## 
## 935 samples
## 620 predictors
##   2 classes: 'CTL', 'NKC' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 841, 841, 841, 842, 842, 841, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens         Spec     
##     2   0.8186980  0.004166667  0.9985714
##     3   0.8220146  0.029710145  0.9928571
##     7   0.8253601  0.021195652  0.9928364
##    13   0.8312241  0.034239130  0.9899793
##    25   0.8306692  0.054891304  0.9856936
##    48   0.8348781  0.076449275  0.9799793
##    91   0.8359281  0.114130435  0.9785507
##   173   0.8329670  0.139130435  0.9714079
##   327   0.8312442  0.152355072  0.9756936
##   620   0.8261274  0.190579710  0.9728364
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 91.
df3=test_set_processed
colnames(df3) = gsub("-", "_", colnames(df3))
# Test the accuracy of the final model on the test set
prediction = predict(rf2,newdata = df3)
postResample(prediction,df3$sample)
##  Accuracy     Kappa 
## 0.7403846 0.1474835
confusionMatrix(prediction,df3$sample)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction CTL NKC
##        CTL  11   3
##        NKC  78 220
##                                          
##                Accuracy : 0.7404         
##                  95% CI : (0.688, 0.7881)
##     No Information Rate : 0.7147         
##     P-Value [Acc > NIR] : 0.1738         
##                                          
##                   Kappa : 0.1475         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.12360        
##             Specificity : 0.98655        
##          Pos Pred Value : 0.78571        
##          Neg Pred Value : 0.73826        
##              Prevalence : 0.28526        
##          Detection Rate : 0.03526        
##    Detection Prevalence : 0.04487        
##       Balanced Accuracy : 0.55507        
##                                          
##        'Positive' Class : CTL            
## 

##Random Forest most important features

roc_imp_rf <- varImp(rf2, scale = FALSE)
plot(roc_imp_rf, top=30)

exp.mtx.tsne = cbind(tsne.df[rownames(exp.mtx),], exp.mtx)
exp.mtx.tsne %>% ggplot(aes(tsne.1, tsne.2, color=GZMA)) + geom_point()

var_importance <- varImp(rf3, scale = FALSE)
plot(var_importance, top=30)

Find markers based on clustering

Use all_data from above

# Use numeric values only  
all_data = tmp[,4:ncol(tmp)]
# predict  
all_data = predict(preProcess(all_data, method = c("center", "scale", "YeoJohnson", "nzv")),all_data)

all_data[1:3,1:8]
rownames(all_data) = tmp$Row.names
all_data$sample = tmp$sample
all_data %>% dim()
all_data$sample =factor(all_data$sample, levels = c("CTL","NKC"))
# I can't use DESeq2 because I only have a scaled data  
# Load DESeq2 library
library(DESeq2)

# Create a DESeqDataSet
dds <- DESeqDataSetFromMatrix(countData, colData, design = ~ cluster)

# Run differential expression analysis
dds <- DESeq(dds)

# Extract results
res <- results(dds)

# Filter for significant genes
significant_genes <- res[which(res$padj < 0.05), ]

# Get top markers for each cluster
top_markers_per_cluster <- by(significant_genes, significant_genes$cluster, function(cluster_data) {
  top_n(cluster_data, n = 10, wt = abs(log2FoldChange))
})
scaled_data= all_data[names(kmeans_result$cluster),]
scaled_data$cluster = kmeans_result$cluster 
scaled_data$cluster = factor(scaled_data$cluster)
levels(scaled_data$cluster) = c(rep("endogenous_NK",3), rep("additional_NK",3))

# Filter the data for the two clusters
cols=colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]
endogenous_NK_data <- scaled_data[scaled_data$cluster == "endogenous_NK", ][,colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]]
additional_NK_data <- scaled_data[scaled_data$cluster == "additional_NK", ][,colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]]
differentially_expressed_genes <- data.frame(Gene = character(0), P_Value = numeric(0), MeanDifference = numeric(0))

for (gene in cols) {
  endo_mean <- mean(endogenous_NK_data[, gene])
  add_mean <- mean(additional_NK_data[, gene])
  p_value <- wilcox.test(endogenous_NK_data[, gene], additional_NK_data[, gene])$p.value
  
  if (p_value < 0.05) {  # You can adjust the significance threshold (e.g., 0.05)
    mean_difference <- endo_mean - add_mean
    differentially_expressed_genes <- rbind(differentially_expressed_genes, 
                                            data.frame(Gene = gene, P_Value = p_value, MeanDifference = mean_difference))
  }
}
differentially_expressed_genes %>% arrange(MeanDifference) %>% head(10)
##        Gene      P_Value MeanDifference
## 1   ALOX5AP 1.329318e-89     -1.1136153
## 2      CD52 5.402229e-81     -1.1060831
## 3      CTSW 1.246408e-78     -1.0773024
## 4      CCL3 8.141245e-66     -0.9704443
## 5    CCL4L2 6.043726e-61     -0.9429490
## 6      JAML 1.139564e-62     -0.9269700
## 7      GZMA 6.287761e-56     -0.9094653
## 8  SH3BGRL3 2.415529e-57     -0.9023268
## 9     CEBPD 1.102471e-48     -0.8792903
## 10   FCER1G 7.346751e-49     -0.8531323
differentially_expressed_genes %>% filter(MeanDifference > 0.1) %>% arrange(P_Value)
##        Gene      P_Value MeanDifference
## 1     RPS18 2.003957e-74      0.9735399
## 2      RPS2 4.186645e-47      0.7800980
## 3    RPS4Y1 5.496090e-24      0.6161586
## 4    SPOCK2 4.766461e-19      0.5104610
## 5     ITGA4 9.778338e-18      0.5249422
## 6    EEF1B2 3.051736e-17      0.3639063
## 7    MALAT1 5.723633e-17      0.4500622
## 8     CXCR4 5.091805e-16      0.3945730
## 9     RPLP0 1.064833e-14      0.3834748
## 10   TMSB10 2.528810e-14      0.3793897
## 11      MIF 6.464242e-13      0.3426885
## 12     RPSA 8.527060e-13      0.3487072
## 13    DUSP2 2.785634e-11      0.4217843
## 14   GIMAP7 8.010904e-11      0.3171209
## 15    PDE4B 2.388929e-10      0.4040134
## 16     GZMH 2.898639e-08      0.2473633
## 17     NME2 2.259812e-07      0.2001636
## 18   ZNF331 4.200823e-07      0.3235077
## 19    HEBP2 6.443505e-07      0.3208917
## 20     GAS5 2.461599e-06      0.1819466
## 21     GZMM 4.559896e-06      0.3081114
## 22   PLAAT4 7.539248e-06      0.1719632
## 23     TCF7 1.361239e-05      0.2826387
## 24    ZFAS1 1.641383e-05      0.1289275
## 25     MT2A 1.092192e-04      0.1291319
## 26    NR4A2 1.407604e-04      0.2625901
## 27 HSP90AB1 1.854971e-04      0.1455816
## 28      CD7 5.884829e-04      0.1155778
## 29     TSPO 5.934343e-04      0.1083054
## 30   SNHG12 2.541000e-03      0.2285674
## 31     TUBB 2.646932e-03      0.1203508
## 32      ID3 3.987375e-03      0.2483297
## 33   TMIGD2 5.773546e-03      0.2035260
## 34    KDM6B 8.634898e-03      0.2097041
## 35     CD3G 8.877499e-03      0.1611099
## 36    PDCL3 1.349684e-02      0.1750981
## 37     CD83 1.689106e-02      0.2057964
## 38   IFITM3 2.161156e-02      0.2023153
## 39     MT1X 2.822768e-02      0.1874456
differentially_expressed_genes %>% filter(MeanDifference < - 0.5) %>% arrange(P_Value)
##          Gene      P_Value MeanDifference
## 1     ALOX5AP 1.329318e-89     -1.1136153
## 2        CD52 5.402229e-81     -1.1060831
## 3        CTSW 1.246408e-78     -1.0773024
## 4        CCL3 8.141245e-66     -0.9704443
## 5        JAML 1.139564e-62     -0.9269700
## 6      CCL4L2 6.043726e-61     -0.9429490
## 7    SH3BGRL3 2.415529e-57     -0.9023268
## 8        GZMA 6.287761e-56     -0.9094653
## 9      FCER1G 7.346751e-49     -0.8531323
## 10      CEBPD 1.102471e-48     -0.8792903
## 11       ACTB 3.451951e-46     -0.8066236
## 12       PAG1 3.930850e-46     -0.7927725
## 13       LIM2 9.898872e-45     -0.7665869
## 14       CCL5 4.836690e-44     -0.8218172
## 15 AC092821.3 1.351230e-43     -0.8283555
## 16      ARL4C 7.952528e-42     -0.8161295
## 17       SRGN 7.527401e-40     -0.7525193
## 18     IGFLR1 1.261123e-39     -0.8060718
## 19       CCL4 1.211623e-38     -0.7437139
## 20    S100A11 2.760035e-38     -0.7542336
## 21    TNFSF10 1.767410e-37     -0.7801545
## 22     SYNGR1 4.306390e-37     -0.6951116
## 23       GLRX 2.070608e-36     -0.6845145
## 24       XIST 3.421161e-36     -0.7560991
## 25      PLIN2 4.224885e-36     -0.7598861
## 26    HLA-DRA 4.644781e-36     -0.7579996
## 27      CEBPB 7.591118e-36     -0.7557146
## 28       CD82 7.650326e-36     -0.7455798
## 29      LAIR2 6.667963e-35     -0.6647056
## 30      IL2RA 9.219141e-35     -0.6868486
## 31       HOPX 2.188551e-34     -0.7622123
## 32       GZMB 6.319918e-34     -0.7231920
## 33       UCP2 1.517852e-33     -0.6644241
## 34     LGALS3 2.562817e-33     -0.7322306
## 35        MAF 3.983832e-33     -0.6602905
## 36       TPM4 7.982310e-32     -0.7059643
## 37       CTSB 3.532622e-31     -0.6330372
## 38      CLIC3 1.743288e-30     -0.6441608
## 39       CTSD 7.721000e-30     -0.7167413
## 40       CAPG 1.122687e-29     -0.7106018
## 41   HLA-DRB1 1.987496e-29     -0.7090800
## 42      RGS10 3.887982e-29     -0.6672097
## 43      MMP25 9.671099e-29     -0.5945201
## 44    HLA-DMA 2.030038e-28     -0.5937054
## 45      CPNE7 7.172844e-28     -0.5856930
## 46       CD74 1.038888e-27     -0.6471750
## 47     DRAXIN 1.496445e-27     -0.5697371
## 48   C12orf75 1.941916e-27     -0.6490073
## 49       TESC 1.006995e-26     -0.5978311
## 50       SQLE 4.863345e-26     -0.5717253
## 51       LCP1 6.693591e-26     -0.6429861
## 52       SMOX 7.679158e-26     -0.5614130
## 53        GEM 7.842933e-26     -0.5342845
## 54     CCL3L1 2.846188e-25     -0.5546991
## 55    TMEM14C 5.417853e-25     -0.5560812
## 56     HAVCR2 6.909213e-25     -0.5408489
## 57   ARHGAP18 9.171238e-25     -0.5660585
## 58     FCGR3A 4.150809e-24     -0.5120324
## 59     ENTPD1 1.916897e-23     -0.5138256
## 60       FTH1 4.967467e-23     -0.5201652
## 61   PPP1R15A 5.845569e-23     -0.6209233
## 62       IFI6 6.557328e-23     -0.5889474
## 63      CCND2 6.571843e-23     -0.6146277
## 64       BATF 6.463243e-22     -0.5116991
## 65        CD9 6.758444e-22     -0.5244289
## 66       AQP3 7.430447e-22     -0.5139439
## 67        VIM 9.786747e-22     -0.5469913
## 68    ANKRD28 1.080723e-21     -0.5041171
## 69       NCF4 1.466033e-21     -0.5264441
## 70        PGD 1.506905e-21     -0.5127455
## 71      RGS16 1.720735e-21     -0.5683245
## 72       TOX2 6.068589e-21     -0.5108969
## 73     MT-CO3 2.508376e-20     -0.5251673
## 74       LMNA 7.969750e-20     -0.5693821
## 75     SLC1A4 2.175407e-19     -0.5001452
## 76      ACTG1 2.672603e-19     -0.5414504
## 77      TIGIT 9.578690e-19     -0.5920736
## 78       BST2 1.439517e-18     -0.5561233
## 79     CORO1A 8.397181e-18     -0.5707347
## 80      HDLBP 1.907671e-17     -0.5403632
## 81      HMGB2 5.964086e-17     -0.5163635
## 82     MRPL52 3.024085e-16     -0.5104916
## 83       IER3 1.050899e-14     -0.5136831
## 84      KLRC2 2.076450e-14     -0.5011354
## 85     MYL12A 6.133341e-14     -0.5062116
differentially_expressed_genes %>% ggplot(aes(MeanDifference)) + geom_density()

differentially_expressed_genes$MeanDifference %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1136 -0.3974 -0.2486 -0.2659 -0.1330  0.9735

NK cells names

endogenous_NK_rows <- scaled_data[scaled_data$cluster == "endogenous_NK", c('cluster','sample')] %>% rownames()
additional_NK_rows <- scaled_data[scaled_data$cluster == "additional_NK", c('cluster','sample')] %>% rownames()
obj.srt = readRDS(paste0(dir, "rds/Takeda.NK.30293.30298.NK_only.23.09.28.rds"))
obj.srt@meta.data$NK = ""
obj.srt@meta.data[endogenous_NK_rows,]$NK = "Endogenous_NK from CTL and NKC"
obj.srt@meta.data[additional_NK_rows,]$NK = "Additional_NK from NKC"
DimPlot(obj.srt, group.by = "NK", cols = c("grey","red","blue"))

DimPlot(obj.srt, group.by = "NK", cols = c("grey","red","blue"), split.by = "sample", ncol = 2)

genes = c("NKG7", "KLRC1", "KLRD1", "KLRB1")
FeaturePlot(obj.srt, genes, pt.size = 0.2)

VlnPlot(obj.srt, features = genes, group.by = "NK", stack = T, flip = F)

genes = rownames(obj.srt)[grepl("GZM", rownames(obj.srt))]
FeaturePlot(obj.srt, genes, pt.size = 0.2, ncol = 3)

VlnPlot(obj.srt, features = genes, group.by = "NK", stack = T, flip = F)

# Find markers between NKs 
g2 = "Additional_NK from NKC"
g1 = "Endogenous_NK from CTL and NKC"
logfc=log2(1)
Idents(obj.srt) = 'NK'
  logfc=log2(1)
  mks =FindMarkers(obj.srt, ident.1 = g2, ident.2 = g1, 
                   logfc.threshold = logfc)
  pval=0.05
  fc=1.2
  mks = mks %>% mutate(DE=ifelse(avg_log2FC >= log2(fc) & p_val_adj < pval, 'UP',
                                 ifelse(avg_log2FC <= -log2(fc) & p_val_adj < pval, 'DN','no_sig')))
  mks$DE = factor(mks$DE, levels = c('UP','DN','no_sig'))
  mks$gene = rownames(mks)
  mks =mks %>% mutate(labels= ifelse(DE == 'UP', gene, ifelse(DE=='DN',gene,'other')))
  mks =mks %>% arrange(desc(avg_log2FC))
  mks$gene <- rownames(mks)
mks =mks %>% mutate(labels= ifelse(DE == 'UP', gene, ifelse(DE=='DN',gene,'')))
mks =mks %>% arrange(desc(avg_log2FC))
mks %>% ggplot(aes(avg_log2FC, -log10(p_val_adj), color = DE)) + 
      geom_point(size = 1, alpha = 0.5) + 
      scale_color_manual(values = c("red",'blue', 'grey')) +
      theme_classic() +
      geom_vline(xintercept = c(-log2(fc), log2(fc)), color = 'grey') +
      geom_hline(yintercept = -log10(0.05), color = 'grey') +
      guides(colour = guide_legend(override.aes = list(size = 5))) +
  geom_text(aes(label = labels), size = 3, show.legend = FALSE, hjust = 0, nudge_x = 0.03) +
      ggeasy::easy_center_title() ## to center title

mks[mks$DE=="UP",]$gene 
##   [1] "REG4"       "NTRK2"      "UTS2"       "MB"         "DMD"       
##   [6] "SAXO2"      "CYP1B1"     "OLIG3"      "CREB3L3"    "LAIR2"     
##  [11] "CSF2RB"     "CCL4L2"     "LINC00892"  "LIM2"       "AC040970.1"
##  [16] "HHLA3"      "OTUB2"      "APBB2"      "MCEMP1"     "MSC-AS1"   
##  [21] "JAML"       "MB21D2"     "NINJ2"      "NPTX1"      "WNT11"     
##  [26] "CCL3L1"     "TJP1"       "AMZ1"       "AL139330.1" "RYR1"      
##  [31] "TFPI"       "CEACAM1"    "MYO1E"      "CLEC12A"    "ABCG1"     
##  [36] "PRSS57"     "MAL"        "STAP1"      "CD109"      "FUT7"      
##  [41] "CTLA4"      "CLECL1"     "LZTS1"      "GPR35"      "CD244"     
##  [46] "MMP25"      "DRAXIN"     "SMOX"       "SMCO4"      "DPF3"      
##  [51] "CCL3"       "RCBTB2"     "TMEM273"    "ASB2"       "PAX8-AS1"  
##  [56] "SYNGR1"     "SIGLEC7"    "SLC1A4"     "LY6G5C"     "MSC"       
##  [61] "PGLYRP2"    "HYLS1"      "NETO2"      "LPAR6"      "LRRC28"    
##  [66] "BCAR3"      "NQO1"       "PAG1"       "GPAT3"      "MAF"       
##  [71] "TGFBR3"     "CSGALNACT1" "IL1RN"      "UCP2"       "TESC"      
##  [76] "IL2RA"      "IL9R"       "PRKAR1B"    "CXorf40B"   "IKZF4"     
##  [81] "CAMK1"      "CPNE7"      "NPDC1"      "TOX2"       "EPB41L2"   
##  [86] "RUNX2"      "AQP3"       "MTSS1"      "CTSB"       "NCF4"      
##  [91] "PLK1"       "FDXR"       "TBXAS1"     "CORO1C"     "CD52"      
##  [96] "ARHGEF12"   "LAIR1"      "LDB2"       "AC017104.1" "FUT8"      
## [101] "COPZ2"      "NAPRT"      "PTPRN2"     "CAVIN3"     "PGD"       
## [106] "LUCAT1"     "TNF"        "FBXO32"     "LINC00299"  "DUSP6"     
## [111] "HLA-DMA"    "RITA1"      "PACSIN2"    "VAMP1"      "C3orf14"   
## [116] "ITGA2"      "MPZL3"      "DYRK4"      "AC092821.3" "FNIP2"     
## [121] "P2RX4"      "FCGR3A"     "ARHGAP18"   "GNA12"      "IFITM10"   
## [126] "GLRX"       "TFEB"       "BUB1"       "GEM"        "PDGFA"     
## [131] "EIPR1"      "ITGB7"      "PDLIM7"     "CTSH"       "CCDC50"    
## [136] "TST"        "ITPRIPL2"   "CEBPD"      "AGTRAP"     "FEZ1"      
## [141] "ZNF683"     "CTSW"       "PPARA"      "LINC00539"  "EMP1"      
## [146] "NCALD"      "KNL1"       "CD70"       "DPYSL2"     "CCL4"      
## [151] "RGS9"       "SYTL2"      "SQLE"       "DOCK5"      "WIPI1"     
## [156] "CD82"       "ITGAM"      "GSN"        "CEBPB"      "TPX2"      
## [161] "KIR2DL4"    "GBA"        "TRGC1"      "HLA-DRA"    "ALOX5AP"   
## [166] "FADS1"      "XIST"       "CD226"      "PRAF2"      "H2AFJ"     
## [171] "HMGB3"      "CDKN1A"     "GNGT2"      "ENTPD1"     "AC022706.1"
## [176] "CD9"        "GRAMD1A"    "MKI67"      "SESTD1"     "IFI6"      
## [181] "LTC4S"      "SLCO4A1"    "ATP6V1B2"   "TNFRSF9"    "GLIPR1"    
## [186] "IGFLR1"     "GZMA"       "INPP5F"     "CD68"       "ITGAE"     
## [191] "BATF"       "AC007384.1" "STRN3"      "TOP2A"      "SYNJ2"     
## [196] "AFDN"       "CPD"        "ETHE1"      "LGALS3"     "CCR1"      
## [201] "ZMIZ1"      "TMEM14C"    "CD59"       "RGS10"      "SLC27A2"   
## [206] "FBXO6"      "TPM4"       "STX11"      "NECTIN3"    "PIK3R6"    
## [211] "C12orf75"   "CAPN12"     "ANKRD28"    "GPALPP1"    "IER5L"     
## [216] "PTPN6"      "ASPM"       "MMP24OS"    "YPEL1"      "ORAI3"     
## [221] "CENPF"      "ADCY3"      "GALM"       "TRIM13"     "GYG1"      
## [226] "IL18RAP"    "WDR1"       "RCSD1"      "FCER1G"     "SLFN12"    
## [231] "TNFRSF1A"   "TPST2"      "CENPW"      "ZNF720"     "RGS16"     
## [236] "NCR3"       "RCAN1"      "PLIN2"      "ERI1"       "FHL3"      
## [241] "PIM1"       "APOBR"      "FDFT1"      "SNX10"      "HIST3H2A"  
## [246] "NUMB"       "CLIC3"      "TNFSF10"    "HOPX"       "SCPEP1"    
## [251] "ANTXR2"     "ARID3B"     "GPR68"      "NABP2"      "PYM1"      
## [256] "FUCA1"      "SAMHD1"     "ARHGEF3"    "TANK"       "TFPT"      
## [261] "HLA-DQA1"   "HOXB4"      "NSMCE1"     "ARL4C"      "HNRNPLL"   
## [266] "TNFSF12"    "NDFIP2"     "PPP1R15A"   "AP1S2"      "RHBDF2"    
## [271] "GRN"        "SLC1A5"     "FAM3C"      "USB1"       "GDE1"      
## [276] "KLHL42"     "SNX9"       "SEPTIN1"    "PLEK"       "TSPAN14"   
## [281] "ATF5"       "TMEM65"     "CTNNA1"     "KLRC1"      "CASP1"     
## [286] "HAVCR2"     "ECE1"       "UBALD2"     "ACADVL"     "CHST11"    
## [291] "HTATSF1"    "SQOR"       "PRKRIP1"    "CD74"       "TGFBR1"    
## [296] "SNAP29"     "ITGB2"      "LAT2"       "MIB1"       "HDGFL3"    
## [301] "CAPG"       "CSF1"       "TAGAP"      "SFXN1"      "PXK"       
## [306] "PERP"       "ZBTB43"     "TMEM141"    "FNDC3B"     "PHLDA1"    
## [311] "EZH2"       "KCNAB2"     "DUSP5"      "SH3BGRL3"   "RAB27A"    
## [316] "OAS2"       "HSH2D"      "LMNA"       "EBP"        "CTSC"      
## [321] "SEC61A1"    "TEX30"      "HLA-DRB1"   "KMT5A"      "TRAF5"     
## [326] "ARL8B"      "ICAM3"      "CTSA"       "HSDL2"      "TNFAIP8"   
## [331] "IL21R"      "OSTF1"      "HMOX1"      "PRNP"       "ZDHHC3"    
## [336] "ENO2"       "BST2"       "HLA-DMB"    "GNA15"      "CLCN3"     
## [341] "DPP4"       "BCL2L11"    "TIAM1"      "IQGAP2"     "CTSD"      
## [346] "RAB1B"      "HIPK1"      "MXRA7"      "LAPTM5"     "CCND2"     
## [351] "HMGB2"      "RB1"        "DCXR"       "SLAMF7"     "ARRB2"     
## [356] "CYTH4"      "HIPK2"      "MRPL52"     "RFLNB"      "ZAP70"     
## [361] "CITED2"     "NQO2"       "CD58"       "HDLBP"      "FAM192A"   
## [366] "NDUFB3"     "TYROBP"     "GOLIM4"     "CTBS"       "GCNT1"     
## [371] "NDUFS4"     "TRAPPC5"    "HCST"       "NCOR2"      "DNASE2"    
## [376] "ACTB"       "DUSP10"     "UTP11"      "NFIL3"      "MPV17"     
## [381] "PLD3"       "HLA-DQB1"   "QKI"        "AP1S1"      "LMNB1"     
## [386] "P2RX5"      "STOML2"     "GNPTAB"     "RAC2"       "LCP1"      
## [391] "NCAM1"      "PLEKHF1"    "NDUFA7"     "PSTPIP1"    "NAA38"     
## [396] "PRPF6"      "THUMPD3"    "TSEN54"     "SFXN3"      "ATP8B4"    
## [401] "CARHSP1"    "CD48"       "PLEC"       "EI24"       "MR1"       
## [406] "ABI3"       "SH3BP1"     "IGF2R"      "OSBPL8"     "VMA21"     
## [411] "SEPTIN11"   "VPS25"      "CHMP4B"     "NAPA"       "NUDT5"     
## [416] "FURIN"      "TRAPPC3"    "KDELR1"     "AHNAK"      "DDAH2"     
## [421] "S100A11"    "CCL5"       "MPST"       "CSGALNACT2" "SRGN"      
## [426] "MCM5"       "MTHFD2"     "XIAP"       "CLIP1"      "ARPC5"     
## [431] "MDM2"       "COMMD3"     "PPP3CA"     "COX11"      "CD151"     
## [436] "DBNL"       "DAAM1"      "WARS"       "ERCC1"      "TNFRSF1B"  
## [441] "STAT3"      "MATK"       "PRDX6"      "IFI27L2"    "AKIRIN2"   
## [446] "IFNG"       "IDH2"       "AC243829.4" "DENND1B"    "CALCOCO2"  
## [451] "VIM"        "MT-CO2"     "RHOC"       "LAMTOR2"    "SPTY2D1"   
## [456] "CD3D"       "SHMT2"      "MAP3K8"     "KLRC2"      "MOB3A"     
## [461] "ZBTB16"     "PYCARD"     "CST7"       "MT-CO1"     "DEF6"      
## [466] "TFDP2"      "MRPL28"     "PIGX"       "RNF167"     "TRIP12"    
## [471] "TIPARP"     "CHMP5"      "JOSD2"      "BAX"        "PCID2"     
## [476] "HPRT1"      "SAP30"      "SNRPB2"     "IER3"       "FERMT3"    
## [481] "ABHD2"      "STAT1"      "SPEN"       "TMC6"       "ZNHIT1"    
## [486] "CTSS"       "CMIP"       "CD96"       "FBXW11"     "SOX4"      
## [491] "GMFG"       "GZMB"       "C15orf40"   "TALDO1"     "TBC1D10C"  
## [496] "ANKH"       "PPP1R18"    "GALNT2"     "TMX4"       "CD99"      
## [501] "NDUFB1"     "ANKRD11"    "MYO9B"      "DOK2"       "DOCK8"     
## [506] "SERINC3"    "HELZ"       "FASLG"      "ANP32E"     "ZDHHC20"   
## [511] "DYNLL1"     "CD164"      "AP3D1"      "NKG7"       "RYBP"      
## [516] "TRAF3IP3"   "SECISBP2L"  "ZFP36L1"    "CD44"       "IVNS1ABP"  
## [521] "COMT"       "JARID2"     "RAB1A"      "CAPZB"      "MRPS11"    
## [526] "TXK"        "SLC6A6"     "TAF12"      "GPR171"     "FLOT1"     
## [531] "FMNL3"      "CLTC"       "ZNRD1"      "MAPRE1"     "PDE3B"     
## [536] "SAT2"       "CIAO1"      "MOB1A"      "HMOX2"      "LCK"       
## [541] "MEF2A"      "DNPH1"      "LAGE3"      "UGP2"       "RPS27L"    
## [546] "ABHD17A"    "OIP5-AS1"   "CORO1A"     "HM13"       "SPN"       
## [551] "STMN1"      "TKT"        "ATP5IF1"    "HMGN2"      "LGALS1"    
## [556] "TM9SF2"     "GUSB"       "MRPS15"     "GMCL1"      "APOL6"     
## [561] "RBM5"       "ST8SIA4"    "CBX5"       "EMB"        "MAPK1"     
## [566] "SPATS2L"    "EIF4G3"     "CHCHD5"     "RHOH"       "MAD2L2"    
## [571] "MRPS6"      "DNM2"       "LDLRAD4"    "PRKCQ-AS1"  "TIGIT"     
## [576] "SAMSN1"     "IP6K2"      "ACTG1"      "FNIP1"      "DAD1"      
## [581] "TLN1"       "MZT2B"      "INPP4A"     "ITGAL"      "RAB7A"     
## [586] "NMRK1"      "SRGAP3"     "PICALM"     "APOBEC3C"   "ABRACL"    
## [591] "DYNLRB1"    "TMCO1"      "PSMD14"     "C6orf89"    "MRPL34"    
## [596] "NKAP"       "RDX"        "LPXN"       "NCOA4"      "TEX264"    
## [601] "RIN3"       "ATP5F1A"    "ZFP91"      "SMARCA4"    "POLD4"     
## [606] "TXN2"       "COPRS"      "RAB8B"      "TOR1A"      "ETFB"      
## [611] "HIF1A"      "PRKDC"      "INTS6"      "MSI2"       "SQSTM1"    
## [616] "PLSCR1"     "TXNRD1"     "CDC42SE1"   "EIF2AK2"    "COMMD4"    
## [621] "LSP1"       "UFD1"       "SLC3A2"     "ARL3"       "CLTB"      
## [626] "UBTF"       "PSMB2"      "CHD9"       "NEMF"       "EVL"       
## [631] "MRPS7"      "MIS18BP1"   "GLO1"       "RAP1A"      "EFHD2"     
## [636] "LYN"        "FLNA"       "DPM2"       "MORF4L2"    "SRI"       
## [641] "CYTOR"      "POLR3GL"    "AOAH"       "BHLHE40"    "ANXA5"     
## [646] "SLC9A3R1"   "FAM107B"    "RNF187"     "SELENOH"    "CTNNB1"    
## [651] "HSD17B10"   "MYL6"       "SEPTIN2"    "STIP1"      "SLC16A3"   
## [656] "PSMD4"      "REEP5"      "ZBTB7A"     "DIAPH1"     "GLUD1"     
## [661] "TYMP"       "ACTR3"      "PGAM1"      "USP9X"      "ITM2A"     
## [666] "KXD1"       "RASAL3"     "CXXC5"      "UBE2V1"     "RAB11FIP1" 
## [671] "SLC5A3"     "CD2BP2"     "C7orf50"    "ATP6V0D1"   "PSAP"      
## [676] "SASH3"      "CARD16"     "DYNC1H1"    "LSM2"       "SEC14L1"   
## [681] "CSTB"       "INPP5D"     "MSN"        "TMEM243"    "SKAP1"     
## [686] "SPAG9"      "SLFN5"      "APMAP"      "FTH1"       "OTULIN"    
## [691] "ADSS"       "GRK2"       "GOPC"       "MYL12A"     "NLRC3"     
## [696] "CRIP1"      "NBL1"       "SH3BGRL"    "ATP5PB"     "C17orf49"  
## [701] "CCT8"       "ANXA2"      "TMEM258"    "RBX1"       "LINC01871" 
## [706] "DDX6"       "PARK7"      "AHCYL1"     "PRMT1"      "TSG101"    
## [711] "LAMP2"      "PSENEN"     "TMBIM6"     "RTF2"       "COPB2"     
## [716] "ASAH1"      "KLF13"      "SERF2"      "ITGB1"      "H2AFV"     
## [721] "ARPC1B"     "BSG"        "ITM2C"      "CAP1"       "CDC123"    
## [726] "TIMP1"      "GABARAPL2"  "TIPRL"      "NDFIP1"     "IFNGR1"    
## [731] "SIPA1L1"    "EIF3A"      "H3F3A"      "CKLF"       "SEPTIN9"   
## [736] "CARD19"     "UBE2F"      "NDUFC2"     "CLTA"       "SEM1"      
## [741] "GNG2"       "XCL2"       "OS9"        "OCIAD2"     "IDS"       
## [746] "ATP5F1C"    "MT-CYB"     "ANXA11"     "PPP1CA"     "TRBC1"     
## [751] "WAC"        "PGLS"       "LIMS1"      "CLIC1"      "GPX4"      
## [756] "MBD2"       "GNB2"       "CHURC1"     "PPP1R2"     "SDF4"      
## [761] "AP2M1"      "NSA2"       "MAP1LC3B"   "CFL1"       "CYTIP"     
## [766] "ACTR2"      "MT-ND4"     "JPX"        "DNAJC15"    "RCN2"      
## [771] "PCBP2"      "DGUOK"      "COMMD8"     "MYO1F"      "ARCN1"     
## [776] "GRB2"       "MT-CO3"     "COX5A"      "COX8A"      "ARL6IP5"   
## [781] "MRPL54"     "NDUFS5"     "DCTN3"      "CSNK1A1"    "C9orf16"   
## [786] "CUTA"       "PFN1"       "MEAF6"      "IL4R"       "SDCBP"     
## [791] "DNAJB6"     "ATP5MF"     "COX6B1"     "TMOD3"      "CD81"      
## [796] "UBE2M"      "MYH9"       "CEP350"     "MYADM"      "FAM49B"    
## [801] "ITM2B"      "PTPN7"      "MBNL1"      "TSTD1"      "PDAP1"     
## [806] "PSMD8"      "TNFRSF18"   "CAPNS1"     "KHDRBS1"    "ZYX"       
## [811] "PSMC4"      "CD47"       "ADAR"       "CHMP2A"     "MT-ATP6"   
## [816] "FTL"        "MACF1"      "CD63"       "COMMD6"     "BRK1"      
## [821] "BRD7"       "EIF3K"      "UBE2I"      "TCEA1"      "ACAP2"     
## [826] "RALY"       "COPE"       "HSPA1A"     "YWHAQ"      "KAT6B"     
## [831] "SMARCE1"    "BTG2"       "BOD1L1"     "NDUFB6"     "GNAI2"     
## [836] "MCL1"       "SNX3"       "SPCS1"      "ARID5A"     "ARPC3"     
## [841] "YME1L1"     "NPM1"       "CCSER2"     "ATP6V0C"    "OCIAD1"    
## [846] "MRPL33"     "AHI1"       "SOCS1"      "BLOC1S1"    "ACAA2"     
## [851] "CLK1"       "CMPK1"      "RBM25"      "SSR3"       "SRP14"     
## [856] "LASP1"      "VCP"        "COMMD7"     "IK"         "ARPC2"     
## [861] "RER1"       "HMGN3"      "ID2"        "WDR83OS"    "UQCRQ"     
## [866] "PPP2R5C"    "KPNB1"      "NOP10"      "SCAND1"     "P4HB"
mks[mks$DE=="DN",]$gene
##   [1] "RPL6"   "RPS15A" "NACA"   "RPS28"  "RPL17"  "RPL10A" "RPS7"   "RPL27" 
##   [9] "RPL37"  "MALAT1" "RPS15"  "UBA52"  "RPL11"  "RPL15"  "RPL18"  "RPL24" 
##  [17] "EEF1A1" "HLA-B"  "RPL35"  "RPL36"  "RPL3"   "RPL14"  "RPL29"  "BTG1"  
##  [25] "HLA-A"  "MIF"    "RPLP2"  "RPS3A"  "TMSB10" "RPL22"  "RPS13"  "RPS9"  
##  [33] "EEF1G"  "RPL38"  "RPL35A" "RPSA"   "RPS23"  "RPL26"  "RPS11"  "RPLP1" 
##  [41] "FAU"    "RPL8"   "RPS3"   "RPLP0"  "RPS6"   "RPL23A" "RPS21"  "RPS20" 
##  [49] "RPL34"  "RPL28"  "RPL7A"  "RPS5"   "RPS25"  "RPS27A" "RPS8"   "NME2"  
##  [57] "RPL19"  "RPS29"  "RPL31"  "RPL9"   "RPL21"  "RPL41"  "RPL13"  "RPL32" 
##  [65] "RPL30"  "RPL37A" "RPS27"  "RPL39"  "RPS14"  "RPS24"  "RPL13A" "RPS2"  
##  [73] "EEF1B2" "RPL18A" "RPS18"  "CXCR4"  "EEF1D"  "RPS12"  "RPL10"  "RPL36A"
##  [81] "RPL27A" "PIK3R1" "GIMAP7" "GZMH"   "HEBP2"  "PDE4B"  "DUSP2"  "ITGA4" 
##  [89] "ZNF331" "SPOCK2" "KLF3"   "LAYN"   "IL7R"   "IGFBP2" "RPS4Y1" "CXCR6" 
##  [97] "LAG3"   "PRKY"   "DDX3Y"  "GZMK"   "USP9Y"  "EIF1AY" "CD8B"
mks %>% DT::datatable()